# Read the two life-expectancy CSV files (df1 first, then df2) and preview
# three random rows of the first one.
df1, df2 = (
    pd.read_csv(csv_name)
    for csv_name in ('Life Expectancy Data.csv', 'Life_Expectancy_00_15.csv')
)
df1.sample(3)
| Country | Year | Status | Life expectancy | Adult Mortality | infant deaths | Alcohol | percentage expenditure | Hepatitis B | Measles | ... | Polio | Total expenditure | Diphtheria | HIV/AIDS | GDP | Population | thinness 1-19 years | thinness 5-9 years | Income composition of resources | Schooling | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 166 | Bahamas | 2009 | Developing | 74.6 | 168.0 | 0 | 9.29 | 0.000000 | 95.0 | 0 | ... | 97.0 | 7.43 | 96.0 | 0.1 | NaN | NaN | 2.5 | 2.5 | 0.791 | 12.6 |
| 198 | Bangladesh | 2009 | Developing | 69.5 | 144.0 | 135 | 0.01 | 53.264004 | 97.0 | 718 | ... | 97.0 | 2.91 | 97.0 | 0.1 | 681.125368 | 1545478.0 | 19.1 | 19.7 | 0.523 | 8.4 |
| 176 | Bahrain | 2015 | Developing | 76.9 | 69.0 | 0 | NaN | 0.000000 | 98.0 | 0 | ... | 98.0 | NaN | 98.0 | 0.1 | 22688.878240 | NaN | 6.2 | 6.1 | 0.823 | 14.5 |
# Preview three random rows of the second dataset.
df2.sample(3)
| Country | Year | Continent | Least Developed | Life Expectancy | Population | CO2 emissions | Health expenditure | Electric power consumption | Forest area | GDP per capita | Individuals using the Internet | Military expenditure | People practicing open defecation | People using at least basic drinking water services | Obesity among adults | Beer consumption per capita | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1854 | Uruguay | 2014 | South America | False | 77.244 | 3400439 | 1.911518 | 8.073218 | 3001.384301 | 10.754542 | 20093.62407 | 61.460000 | 1.807429 | 0.659960 | 98.993558 | 27.8 | 1.94 |
| 908 | Kazakhstan | 2012 | Asia | False | 69.520 | 16792090 | 14.566383 | 3.037260 | 4892.734896 | 1.175201 | 22032.17454 | 61.906627 | 1.046906 | 0.030528 | 94.338501 | 18.7 | 1.91 |
| 95 | Australia | 2015 | Oceania | False | 82.400 | 23815995 | 15.863288 | 9.327589 | 10071.398980 | 17.324825 | 46248.61646 | 84.560515 | 1.951028 | 0.000000 | 99.970006 | 29.8 | 3.76 |
# Outer join of the two raw datasets on (Country, Year), written to CSV as-is
# (before any country-name harmonisation).
df_vis = df1.merge(df2, on=['Country', 'Year'], how='outer')
df_vis.to_csv('merged_life_expectancy_test.csv')
# Countries of df1 that only have data for the year 2013.
df1_2013_only_countries = [
    'Dominica', 'Cook Islands', 'Marshall Islands', 'Monaco', 'Nauru',
    'Niue', 'Palau', 'Saint Kitts and Nevis', 'San Marino', 'Tuvalu',
]
# Countries present only in df1 (not counting df1_2013_only_countries, which
# are not in df2 either).
df1_only_countries_list = [
    'Afghanistan', 'Antigua and Barbuda', 'Azerbaijan',
    'Bahamas', 'Barbados', 'Belize', 'Bhutan', 'Bosnia and Herzegovina',
    'Burkina Faso', 'Burundi',
    'Cabo Verde', 'Central African Republic', 'Chad', 'Comoros', 'Cuba',
    "Democratic People's Republic of Korea", 'Republic of Korea',
    'Djibouti', 'Dominican Republic',
    'Equatorial Guinea',
    'Fiji',
    'Gambia', 'Grenada', 'Guinea', 'Guinea-Bissau', 'Guyana',
    'Haiti',
    'Iceland',
    'Japan',
    'Kiribati',
    "Lao People's Democratic Republic", 'Lebanon', 'Lesotho', 'Liberia',
    'Madagascar', 'Malawi', 'Maldives', 'Mali', 'Mauritania',
    'Micronesia (Federated States of)',
    'Papua New Guinea',
    'Rwanda',
    'Saint Lucia', 'Saint Vincent and the Grenadines', 'Samoa',
    'Sao Tome and Principe', 'Seychelles', 'Sierra Leone', 'Singapore',
    'Solomon Islands', 'Somalia', 'South Sudan', 'Suriname', 'Swaziland',
    'Syrian Arab Republic',
    'Timor-Leste', 'Tonga', 'Turkmenistan',
    'Uganda', 'United Arab Emirates', 'Uzbekistan',
    'Vanuatu', 'Viet Nam',
    'Zambia',
]
# Countries present in both df1 and df2 but spelled differently
# (key = name used in df1, value = name used in df2).
country_names = {
    'Bolivia (Plurinational State of)': 'Bolivia',
    'Congo': 'Congo Rep.',
    'Democratic Republic of the Congo': 'Congo Dem. Rep.',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Iran (Islamic Republic of)': 'Iran',
    'Kyrgyzstan': 'Kyrgyz Republic',
    'Slovakia': 'Slovak Republic',
    'United Republic of Tanzania': 'Tanzania',
    'Turkey': 'Turkiye',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'United States of America': 'United States',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'The former Yugoslav republic of Macedonia': 'North Macedonia',
    'Republic of Moldova': 'Moldova',
}
# Remove the df1-only countries, rename the remaining countries to the df2
# spelling, then outer-join both datasets on (Country, Year).
df1_clean = df1[
    ~df1['Country'].isin(df1_2013_only_countries + df1_only_countries_list)
].copy()
df1_clean['Country'] = df1_clean['Country'].map(
    lambda name: country_names.get(name, name)
)
df = df1_clean.merge(df2, on=['Country', 'Year'], how='outer')
df.sample(3)
| Country | Year | Status | Life expectancy | Adult Mortality | infant deaths | Alcohol | percentage expenditure | Hepatitis B | Measles | ... | Health expenditure | Electric power consumption | Forest area | GDP per capita | Individuals using the Internet | Military expenditure | People practicing open defecation | People using at least basic drinking water services | Obesity among adults | Beer consumption per capita | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1167 | Myanmar | 2000 | Developing | 62.1 | 243.0 | 73 | 0.35 | 2.511437 | NaN | 845 | ... | 1.995493 | 68.101490 | 53.352679 | 824.962767 | 0.000168 | 2.001199 | 10.111872 | 46.975984 | 1.9 | 0.10454 |
| 1028 | Malaysia | 2011 | Developing | 74.3 | 13.0 | 3 | 0.51 | 9.230026 | 96.0 | 1569 | ... | 3.314519 | 4145.521977 | 57.984976 | 21704.744270 | 61.000000 | 1.552853 | 0.374551 | 97.023887 | 12.1 | 0.39000 |
| 1137 | Mozambique | 2014 | Developing | 56.7 | 375.0 | 61 | 0.01 | 54.911595 | 79.0 | 9 | ... | 6.332318 | 451.043068 | 48.508899 | 1139.051782 | 6.000000 | 0.980759 | 32.294620 | 48.825319 | 5.5 | 0.84000 |
# Persist the merged dataset.
df.to_csv('merged_life_expectancy.csv')
pd.set_option('display.min_rows', 20)

# Share of non-null values per column, truncated to a whole percentage.
tdf = (df.count()/df.shape[0]*100).astype(int).to_frame().rename({0 : 'Remplissage (%)'}, axis=1)

# Show the columns as two side-by-side halves. The split is derived from the
# real number of columns instead of being hard-coded to 19 + 18, so the cell
# keeps working when the merged frame gains or loses a column. With 37 columns
# the result is identical to the previous hard-coded layout.
n_cols = len(tdf)
n_left = (n_cols + 1) // 2
n_right = n_cols - n_left
padding = [' '] * (n_left - n_right)  # right half is one row shorter when n_cols is odd
fill_rate = tdf['Remplissage (%)']
tdf_vis = pd.DataFrame({
    f'Colonnes (1-{n_left})' : tdf.index[:n_left],
    '(%)' : fill_rate.iloc[:n_left].tolist(),
    ' ' : [' ']*n_left,
    '|' : ['|']*n_left,
    f'Colonnes ({n_left + 1}-{n_cols})' : tdf.index[n_left:].tolist()+padding,
    '(%) ' : fill_rate.iloc[n_left:].tolist()+padding})

# Styler.hide_index() was removed in pandas 2.0 (replaced by Styler.hide) and
# Styler.applymap is deprecated in favour of Styler.map; prefer the new names
# when they exist and fall back to the old ones on older pandas.
styler = tdf_vis.style
styler = styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index()
color_cells = styler.map if hasattr(styler, 'map') else styler.applymap
color_cells(set_color)
| Colonnes (1-19) | (%) | | | Colonnes (20-37) | (%) | |
|---|---|---|---|---|---|
| Country | 100 | | | thinness 5-9 years | 99 | |
| Year | 100 | | | Income composition of resources | 94 | |
| Status | 100 | | | Schooling | 94 | |
| Life expectancy | 100 | | | Continent | 100 | |
| Adult Mortality | 100 | | | Least Developed | 100 | |
| infant deaths | 100 | | | Life Expectancy | 100 | |
| Alcohol | 93 | | | Population_y | 100 | |
| percentage expenditure | 100 | | | CO2 emissions | 100 | |
| Hepatitis B | 82 | | | Health expenditure | 100 | |
| Measles | 100 | | | Electric power consumption | 100 | |
| BMI | 99 | | | Forest area | 100 | |
| under-five deaths | 100 | | | GDP per capita | 100 | |
| Polio | 99 | | | Individuals using the Internet | 100 | |
| Total expenditure | 93 | | | Military expenditure | 100 | |
| Diphtheria | 99 | | | People practicing open defecation | 100 | |
| HIV/AIDS | 100 | | | People using at least basic drinking water services | 100 | |
| GDP | 85 | | | Obesity among adults | 100 | |
| Population_x | 79 | | | Beer consumption per capita | 100 | |
| thinness 1-19 years | 99 | | |
# Overlay the distributions of the two life-expectancy columns:
# 'Life expectancy ' (labelled df1) and 'Life Expectancy' (labelled df2).
plt.figure(figsize=(14, 8))
sns.histplot(data=df['Life expectancy '], label='df1', color='r', kde=True)
sns.histplot(data=df['Life Expectancy'], label='df2', kde=True)
plt.title(
    "Histogrammes de l'espérance de vie selon df1 et df2",
    pad=15,
    fontsize=20,
)
plt.legend(fontsize=20);

# Keep only the 'Life Expectancy' column and use it as the regression target.
df.drop(columns=['Life expectancy '], inplace=True)
y = df['Life Expectancy']
# Annotated heatmap of `corr` (expected to be defined by a cell that is not
# part of this excerpt).
plt.figure(figsize=(22, 18))
sns.heatmap(data=corr, fmt='0.1g', annot=True);
# Candidate features, listed by decreasing absolute score (presumably the
# correlation with life expectancy read from the heatmap -- to be confirmed).
# Trailing comment on each entry: score | kept? | note on relevance.
potential_features = [
    'People using at least basic drinking water services',  # +0.82 | yes |
    'Income composition of resources',                      # +0.81 | yes |
    'Schooling',                                            # +0.75 | yes |
    'Adult Mortality',                                      # -0.73 | no  | outliers
    'Continent_Africa',                                     # -0.73 | no  | immutable feature
    'Individuals using the Internet',                       # +0.70 | no  | correlation but no causation
    'Obesity among adults',                                 # +0.69 | no  | correlation but no causation
    'People practicing open defecation',                    # -0.67 | no  | too specific?
    ' HIV/AIDS',                                            # -0.60 | yes |
    ' BMI ',                                                # +0.59 | no  | correlation but no causation
    'GDP per capita',                                       # +0.58 | yes |
    'Electric power consumption',                           # +0.56 | no  | correlation but no causation
    'Least Developed',                                      # -0.55 | no  | to be determined
    ' thinness 5-9 years',                                  # -0.54 | yes |
    'Status_Developed',                                     # +0.49 | yes |
    'Continent_Europe',                                     # +0.48 | no  | immutable feature
    'Diphtheria ',                                          # +0.48 | no  | outliers
    'Health expenditure',                                   # +0.47 | yes |
    'Polio',                                                # +0.47 | no  | outliers
    'CO2 emissions',                                        # +0.46 | no  | correlation but no causation
    'GDP',                                                  # +0.44 | yes |
    'Beer consumption per capita',                          # +0.42 | no  | correlation but no causation
    'Alcohol',                                              # +0.41 | no  | correlation but no causation
    'percentage expenditure',                               # +0.38 | no  | to be determined
]
# Features retained for modelling (the entries marked as kept in
# potential_features). Leading spaces in some names are part of the column name.
features_gardées = [
    'People using at least basic drinking water services',
    'Income composition of resources', 'Schooling', ' HIV/AIDS',
    'GDP per capita', ' thinness 5-9 years', 'Status_Developed',
    'Health expenditure', 'GDP',
]
# Build the modelling frame: the target plus the retained features.
# 'Status_Developed' is excluded from the selection because it is derived from
# the 'Status' column just below.
# dict.fromkeys de-duplicates while keeping a stable, reproducible column order
# (the previous set-based selection produced an arbitrary order on each run).
selected_columns = [
    name
    for name in dict.fromkeys(['Life Expectancy'] + features_gardées)
    if name != 'Status_Developed'
]
# Zeros are treated as missing values, as before. `np.nan` replaces the
# `np.NaN` alias, which was removed in NumPy 2.0, and the vectorised `replace`
# avoids the deprecated element-wise `applymap`.
df_clean = df[selected_columns].replace(0, np.nan)
# 1 when the country is flagged 'Developed', 0 otherwise (including missing).
df_clean['Status_Developed'] = (df['Status'] == 'Developed').astype(int)
df_clean.sample(3)
| Schooling | HIV/AIDS | GDP per capita | Health expenditure | Life Expectancy | thinness 5-9 years | Income composition of resources | People using at least basic drinking water services | GDP | Status_Developed | |
|---|---|---|---|---|---|---|---|---|---|---|
| 119 | 14.4 | 0.1 | 47893.28238 | 3.285104 | 75.770000 | 5.9 | 0.815 | 99.969586 | 2367.565350 | 0 |
| 1108 | 15.1 | 0.1 | 14472.48482 | 7.966922 | 75.982927 | 2.0 | 0.792 | 96.409098 | 7318.742449 | 0 |
| 1217 | 19.2 | 0.1 | 37293.35679 | 9.401684 | 81.404878 | 0.3 | 0.910 | 100.000003 | 4453.246730 | 1 |
# 3x3 grid of scatter plots: life expectancy against each column of df_clean
# (target excluded), all panels sharing the same y axis.
fig, ((ax1,ax2,ax3),(ax4,ax5,ax6),(ax7,ax8,ax9)) = plt.subplots(figsize=(20,12),nrows=3, ncols=3, sharey=True)
axes = [ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8,ax9]
# Iterate the columns in frame order so the panel layout is reproducible
# (the previous set-based iteration gave an arbitrary order).
feature_columns = [name for name in df_clean.columns if name != 'Life Expectancy']
for ax, column in zip(axes, feature_columns):
    # x/y must be passed by keyword: seaborn >= 0.12 rejects positional vectors.
    sns.scatterplot(x=df_clean[column], y=df['Life Expectancy'], ax=ax)
    ax.set_xlabel(column, fontdict=fontlabel)
    ax.set_ylabel('Life Expectancy', fontdict=fontlabel)
fig.suptitle("L'espérance de vie en fonction des features potentielles du modèle", size=20, weight='bold', y=0.91);
class ModeleLineaire():
    """Train/test split, preprocessing and linear regression in one object.

    On construction the data is split, numeric columns are imputed then
    scaled, non-numeric columns are encoded, and a ``LinearRegression`` is
    fitted on the training part.

    Parameters
    ----------
    X : DataFrame of features.
    y : target values aligned with ``X``.
    num_imputer : imputer for numeric columns
        (default: ``SimpleImputer(strategy='median')``).
    num_transformer : scaler for numeric columns (default: ``MinMaxScaler()``).
    cat_transformer : encoder for non-numeric columns
        (default: ``OneHotEncoder(handle_unknown='ignore', drop='first')``).
    test_size : share of the rows kept for the test set.
    """

    def __init__(self, X, y,
                 num_imputer=None,
                 num_transformer=None,
                 cat_transformer=None,
                 test_size=0.3):
        # Default estimators are created per instance. Estimator objects used
        # as default argument values would be built once at class-definition
        # time and shared by every model.
        if num_imputer is None:
            num_imputer = SimpleImputer(strategy='median')
        if num_transformer is None:
            num_transformer = MinMaxScaler()
        if cat_transformer is None:
            cat_transformer = OneHotEncoder(handle_unknown='ignore', drop='first')

        self.X = X
        self.y = y
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=test_size)

        num_cols = X._get_numeric_data().columns.values.tolist()
        # Non-numeric columns, kept in frame order for reproducibility.
        cat_cols = [name for name in X.columns if name not in num_cols]

        num_pipeline = Pipeline([
            ('imputer', num_imputer),
            ('scaler', num_transformer)])
        self.preprocessor = ColumnTransformer([
            ('num_transformer', num_pipeline, num_cols),
            ('cat_transformer', cat_transformer, cat_cols)])
        self.pipeline = make_pipeline(self.preprocessor, LinearRegression())
        # fit() returns the pipeline itself: `pipe` and `pipeline` are the same object.
        self.pipe = self.pipeline.fit(self.X_train, self.y_train)

    def show_scores(self, cv=5, scoring='r2'):
        """Print each cross-validation score, their mean, and the test score."""
        scores = cross_val_score(self.pipe, self.X_train, self.y_train, cv=cv, scoring=scoring)
        for i, r2 in enumerate(scores):
            print(f' {i+1}. r2 = {round(r2,2)}')
        # Average over ALL folds. The previous code rebound the accumulator on
        # every iteration and therefore printed the last fold's score as the mean.
        print('Moyenne des r2 :', round(scores.mean(),2))
        print('Score du test :', round(self.pipe.score(self.X_test, self.y_test),2))

    def finalize(self):
        """Refit the pipeline on the full dataset and return it.

        NOTE: this refits ``self.pipeline`` in place, and ``self.pipe`` is the
        same object, so test-set scores computed afterwards are no longer
        out-of-sample.
        """
        return self.pipeline.fit(self.X, self.y)

    def tts(self):
        """Return ``(X_train, X_test, y_train, y_test)``."""
        return self.X_train, self.X_test, self.y_train, self.y_test

    def prediction(self, X=None):
        """Predict for ``X``, or for the test set when ``X`` is omitted.

        An ``int`` (the historical ``0`` sentinel) is still accepted as
        "use the test set".
        """
        if X is None or isinstance(X, int):
            return self.pipe.predict(self.X_test)
        return self.pipe.predict(X)

    def residuals(self):
        """Return predictions minus observed values on the test set."""
        return self.prediction() - self.y_test

    def het(self):
        """Run the Breusch-Pagan heteroscedasticity test on the test residuals.

        Returns the tuple produced by ``sms.het_breuschpagan``.
        """
        # transform(), not fit_transform(): the preprocessor is the fitted
        # first step of the pipeline, and refitting it on the test set would
        # silently change the trained model.
        exog = self.preprocessor.transform(self.X_test)
        if hasattr(exog, 'toarray'):  # ColumnTransformer may return a sparse matrix
            exog = exog.toarray()
        # The Breusch-Pagan auxiliary regression needs a constant column.
        exog = np.column_stack([np.ones(exog.shape[0]), exog])
        return sms.het_breuschpagan(self.residuals(), exog)

    def plot(self):
        """Draw residual diagnostics: histogram, residuals vs observed, QQ plot."""

        def test_homo():
            # het_breuschpagan returns (LM statistic, LM p-value, F statistic, F p-value).
            p_value = self.het()[1]
            if p_value > 0.05:
                return f"p-value = {round(p_value,2)} > 0.05 :\nNous ne pouvons pas rejeter l'hypothèse nulle.\nC'est homoscédastique"
            return f"p-value = {round(p_value,2)} <= 0.05 :\nNous pouvons rejeter l'hyptohèse nulle.\nC'est hétéroscédastique"

        def remove_spine(axe):
            axe.spines.right.set_visible(False)
            axe.spines.top.set_visible(False)

        residus = self.residuals()
        fontdict = {'fontsize': '16',
                    'fontweight': 'bold',
                    'color': "black"}
        fig, ((ax1,ax2),
              (ax3,ax4)) = plt.subplots(figsize=(15, 10),nrows=2, ncols=2)

        # Panel 1: distribution of the residuals.
        ax1.set_title('Condition de normalité des erreurs', fontdict)
        sns.histplot(residus, ax = ax1, kde = True)
        ax1.set_xlabel('Résidus')
        remove_spine(ax1)

        # Panel 2: residuals against observed values, with a zero reference
        # line drawn over x = 40..89 and the test verdict as an annotation.
        ax2.set_title("Visualisation de l'homoscedasticité", fontdict)
        ax2.scatter(self.y_test, residus, c="red")
        ax2.plot(np.array(range(40,90)),np.zeros(50), c="black")
        ax2.set_xlabel('y observé')
        ax2.set_ylabel('Résidus')
        remove_spine(ax2)
        sentence = test_homo()
        ax2.annotate(sentence,ha='left',fontsize=10,fontstyle='italic', xy=(75, 4),
                     xytext=(65, 9),color='black',fontweight="extra bold", bbox=dict(color="white"),
                     arrowprops=dict(color="#1d2d35",linewidth="2",arrowstyle="->", connectionstyle="angle3"))

        # Panel 3: QQ plot of the residuals; the fourth panel is unused.
        ax3.set_title('QQ plot', fontdict)
        sm.qqplot(residus, line='s', ax= ax3)
        remove_spine(ax3)
        ax4.set_visible(False)
        plt.show()
# Life expectancy against access to basic drinking water services, with a
# fitted regression line, followed by a one-feature linear model.
plt.figure(figsize=(16,8))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.regplot(x=df['People using at least basic drinking water services'], y=y, color='C1', label='Regression linéaire')
sns.scatterplot(x=df['People using at least basic drinking water services'], y=y)
plt.xlabel('People using at least basic drinking water services', fontdict=fontlabel)
plt.ylabel('Life Expectancy', fontdict=fontlabel)
plt.legend(fontsize=15, loc='lower right')
plt.title("L'espérance de vie en fonction du pourcentage de personnes utilisant un service d'eau potable", fontdict=fonttitle);
X1 = df_clean[['People using at least basic drinking water services']]
model1 = ModeleLineaire(X1, y)
model1.show_scores()
1. r2 = 0.66 2. r2 = 0.65 3. r2 = 0.68 4. r2 = 0.71 5. r2 = 0.68 Moyenne des r2 : 0.68 Score du test : 0.67
# Second feature: income composition of resources. Plot it against the target,
# then fit a model on X1 plus this feature.
feature2 = df_clean['Income composition of resources']
plt.figure(figsize=(16,8))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.regplot(x=feature2, y=y, color='C1', label='Regression linéaire')
sns.scatterplot(x=feature2, y=y)
plt.xlabel('Income composition of resources', fontdict=fontlabel)
plt.ylabel('Life Expectancy', fontdict=fontlabel)
plt.legend(fontsize=15, loc='lower right')
plt.title("L'espérance de vie en fonction de l'IDH en terme de composition des revenus des ressources", fontdict=fonttitle);
X2 = pd.concat([X1, feature2], axis=1)
model2 = ModeleLineaire(X2, y)
model2.show_scores()
1. r2 = 0.76 2. r2 = 0.76 3. r2 = 0.71 4. r2 = 0.77 5. r2 = 0.71 Moyenne des r2 : 0.71 Score du test : 0.76
# Interactive 3D scatter: the two features of X2 on x/y, life expectancy on z
# and as the colour scale.
fig = px.scatter_3d(
    data_frame=pd.concat([X2, y], axis=1),
    x='People using at least basic drinking water services',
    y='Income composition of resources',
    z='Life Expectancy',
    color='Life Expectancy',
    opacity=0.7,
    color_continuous_scale=px.colors.diverging.BrBG,
    title="L'espérance de vie en fonction des deux premières features",
)
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 60})
# Third candidate: schooling. Plot it against the target, then fit a model on
# X2 plus this feature.
feature3 = df_clean['Schooling']
plt.figure(figsize=(16,8))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.regplot(x=feature3, y=y, color='C1', label='Regression linéaire')
sns.scatterplot(x=feature3, y=y)
plt.xlabel('Schooling', fontdict=fontlabel)
plt.ylabel('Life Expectancy', fontdict=fontlabel)
plt.legend(fontsize=15, loc='lower right')
plt.title("L'espérance de vie en fonction du nombre moyen d'années d'école", fontdict=fonttitle);
X3 = pd.concat([X2, feature3], axis=1)
model3 = ModeleLineaire(X3, y)
model3.show_scores()
1. r2 = 0.76 2. r2 = 0.75 3. r2 = 0.72 4. r2 = 0.72 5. r2 = 0.81 Moyenne des r2 : 0.81 Score du test : 0.74
# Fourth candidate: GDP per capita, plotted with a log-x regression fit.
# The model is built on X2 (not X3) plus the natural log of this feature.
feature4 = df_clean['GDP per capita']
plt.figure(figsize=(16,8))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.regplot(x=feature4, y=y, color='C1', logx=True, label='Regression linéaire')
sns.scatterplot(x=feature4, y=y)
plt.xlabel('GDP per capita', fontdict=fontlabel)
plt.ylabel('Life Expectancy', fontdict=fontlabel)
plt.legend(fontsize=15, loc='lower right')
plt.title("L'espérance de vie en fonction du PIB par habitant", fontdict=fonttitle);
# Vectorised log; same values as the element-wise apply it replaces.
X4 = pd.concat([X2, np.log(feature4)], axis=1)
model4 = ModeleLineaire(X4, y)
model4.show_scores()
1. r2 = 0.72 2. r2 = 0.74 3. r2 = 0.74 4. r2 = 0.78 5. r2 = 0.76 Moyenne des r2 : 0.76 Score du test : 0.76
# Fifth candidate: HIV/AIDS, plotted with a log-x regression fit.
# The model is built on X2 plus the natural log of this feature; the column is
# renamed to drop the leading space of the original name.
feature5 = df_clean[' HIV/AIDS']
plt.figure(figsize=(16,8))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.regplot(x=feature5, y=y, color='C1', logx=True, label='Regression linéaire')
sns.scatterplot(x=feature5, y=y)
plt.xlabel('HIV/AIDS', fontdict=fontlabel)
plt.ylabel('Life Expectancy', fontdict=fontlabel)
plt.legend(fontsize=15, loc='lower right')
plt.title("L'espérance de vie en du nombre de personnes atteintes du sida par 1000 habitants", fontdict=fonttitle);
# Vectorised log; same values as the element-wise apply it replaces.
X5 = pd.concat([X2, np.log(feature5)], axis=1).rename({' HIV/AIDS':'HIV/AIDS'}, axis=1)
model5 = ModeleLineaire(X5, y)
model5.show_scores()
1. r2 = 0.86 2. r2 = 0.88 3. r2 = 0.87 4. r2 = 0.86 5. r2 = 0.85 Moyenne des r2 : 0.85 Score du test : 0.87
# Interactive 3D scatter of the X5 features: drinking-water access on x,
# HIV/AIDS (log-transformed in X5) on y, life expectancy on z, coloured by
# income composition of resources.
fig = px.scatter_3d(
    data_frame=pd.concat([X5, y], axis=1),
    x='People using at least basic drinking water services',
    y='HIV/AIDS',
    z='Life Expectancy',
    color='Income composition of resources',
    opacity=0.7,
    color_continuous_scale=px.colors.diverging.BrBG,
    title="L'espérance de vie en fonction des trois premières features",
)
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 60})
# Show the residual diagnostics of model5, refit it on the full dataset and
# serialise the fitted pipeline.
model5.plot()
final_model = model5.finalize()
# The context manager guarantees the file is flushed and closed; the previous
# bare open() handle was never closed.
with open('life_expectancy_model.pkl', 'wb') as model_file:
    pickle.dump(final_model, model_file)
from typing import Union
from fastapi import FastAPI
from pandas import DataFrame
from numpy import log
from pickle import Unpickler
app = FastAPI()
# Load the fitted pipeline once at start-up.
# NOTE: unpickling executes arbitrary code -- only load a model file that was
# produced by a trusted process.
# The context manager closes the file; the previous bare open() handle was
# never closed.
with open('life_expectancy_model.pkl', 'rb') as model_file:
    pickled_model = Unpickler(model_file).load()
@app.get("/")
def read_root(drink: Union[float, None] = None,
              income: Union[float, None] = None,
              hiv: Union[float, None] = None):
    """Predict life expectancy from three query parameters.

    Parameters
    ----------
    drink : value for 'People using at least basic drinking water services'.
    income : value for 'Income composition of resources'.
    hiv : value for 'HIV/AIDS'; its natural log is fed to the model, so it
        must be strictly positive when provided.

    Returns
    -------
    ``{'prediction': <int>}`` -- the prediction rounded to the nearest integer.

    Raises
    ------
    HTTPException (422) when ``hiv`` is zero or negative.
    """
    from fastapi import HTTPException  # local import keeps this block self-contained

    # log() of a non-positive number is -inf/NaN; reject it with a clear
    # client error instead of producing an unusable prediction.
    if hiv is not None and hiv <= 0:
        raise HTTPException(status_code=422, detail="'hiv' must be strictly positive")

    # Omitted parameters become NaN so the columns stay numeric. Previously an
    # omitted `hiv` crashed on log(None) and the request ended in a 500.
    # NOTE(review): assumes the pickled pipeline imputes missing numeric
    # values (e.g. a median imputer) -- confirm against the training code.
    missing = float('nan')
    X = DataFrame({'People using at least basic drinking water services' : [missing if drink is None else drink],
                   'Income composition of resources' : [missing if income is None else income],
                   'HIV/AIDS' : [missing if hiv is None else log(hiv)]})
    pred = pickled_model.predict(X)[0]
    # float() first so round() always yields a plain Python int for the JSON body.
    return {'prediction' : round(float(pred))}